# Do not run this script. The raw datasets loaded by it contain personal
# information and are not included in this package.

# This script contains the code that takes the raw datasets of sent queries
# and received responses and prepares the anonymized replication dataset.

# Start from an empty workspace: remove every object that ls() reports in the
# current environment.
rm(list = ls())

library(tidyverse)
library(readxl)

# Prepare the dataset and anonymize it for this replication package  {{{1

# Raw dataset of sent queries (not included in the package).
# Presumably this is what provides `data_full`, which is used below.
load(file = '../Data/data_full.RData')

# Raw dataset of received responses (not included in the package).
emails.split <- read_xlsx(path = '../emails_merged/emails_split_answers.xlsx')

# Put the data together, clean, and recode {{{1

# Builds `rdata`: one row per sent e-mail (ID_mail) with the response outcome
# and characteristics of the selected reply.
rdata <- data_full %>%
  # Natural join: no `by` is given, so every column shared by the two tables
  # is used as a key.
  left_join(emails.split) %>%
  # Remove mismatches: keep rows with a positive reply time and rows where
  # the reply time is missing
  filter(time_to_reply > 0 | is.na(time_to_reply)) %>%
  # Create outcome variable. A row is NOT counted as a response when
  # `reply_to` is missing, when the reply is automatic, or when the subject
  # starts with 'FW' and the reply body matches the redacted pattern.
  mutate(response = !(is.na(reply_to) |
                        merge_kind == 'automatic reply' |
                        (!is.na(reply_subject) &
                           str_detect(reply_subject, '^FW') &
                           str_detect(
                             reply_content,
                             # Anonymized: the original pattern contained
                             # personal information and was removed. It must
                             # be restored here; until then this step stops
                             # with an error instead of misclassifying rows.
                             stop('Pattern removed for anonymization.')
                           )
                        )
  )
  ) %>%
  group_by(ID_mail) %>%
  # Number of rows recorded for each sent e-mail
  add_tally(name = 'distinct_reply') %>%
  # Keep a single row per sent e-mail: rows counted as a response come first,
  # ties are ordered by reply_time
  arrange(ID_mail, desc(response), reply_time, .by_group = TRUE) %>%
  slice(1) %>%
  ungroup() %>%
  mutate(marked_spam = str_detect(reply_subject, 'SPAM'),
         forwarded = merge_kind == 'someone else replied',
         automat = merge_kind == 'automatic reply',
         # Number of whitespace-separated tokens in the reply text
         reply_nwords = str_count(reply_text, "\\S+"),
         # Greeting contains one of the listed name stems
         # (presumably the sender surnames -- TODO confirm)
         name_greet = str_detect(reply_greet, 'svobod|pospis|lakat|gazi')
  ) %>%
  select(ID_mail, response, ethnicity, literacy, email, letter, location,
         time_to_reply, sender_name,
         merge_kind, reply_subject, marked_spam, forwarded, automat,
         reply_nwords, name_greet, sender_mail, email_text,
         distinct_reply, sent, reply_content) %>%
  # Row order matters for the recipient numbering done later in the script
  arrange(email, letter, sender_name)

# Add female recipient dummy using names (anonymized).
# The list has 23 entries; after anonymization every entry is reduced to the
# bare domain, so all of them are the same string.
males <- rep('@uradprace.cz', times = 23)

# A recipient is coded as female when the address is not in the male list.
rdata <- mutate(rdata, female = !(email %in% males))

# Add cities

# Three logical flags derived from substring matches on `location`
# (names suggest Prague and cities above 100k / 50k inhabitants).
rdata <- mutate(
  rdata,
  prague = str_detect(location, 'Praha'),
  city100k = str_detect(location, 'Brno|Ostrava|Plze|Liberec|Olomouc'),
  city50k = str_detect(location, 'České Budějovice|Ústí nad Labem|Hradec Králov|Pardubice|Zlín|Havířov|Kladno|Most|Opava|Frýdek Místek|Karviná|Jihlava')
)

# Load the data on excluded locations (merged with the main dataset below)
excl.locs <- read_csv(
  'Data/Excluded_locations_and_census_population_by_district.csv'
) %>%
  # Columns 2 to 6 are picked by position and then given working names
  select(2:6) %>%
  set_names(c('location', 'nlocations', 'people.lb', 'people.ub',
              'population')) %>%
  mutate(
    # Percentage of the population: midpoint of people.lb and people.ub
    # relative to population; set to 0 where nlocations is 0
    excluded.share = ifelse(
      nlocations == 0,
      0,
      100 * ((people.lb + people.ub) / 2) / population
    )
  )

# Attach excluded.share to the main dataset. The key is spelled out so the
# join cannot silently pick up additional shared columns.
rdata <- rdata %>%
  left_join(excl.locs %>% select(location, excluded.share), by = 'location')

# Anonymize and save the replication dataset {{{2

rdata <-
  rdata %>%
  # Number recipients in order of first appearance of their address. This is
  # derived from the address itself rather than from a hard-coded 457 x 3
  # layout; it yields the same ids (1..457, three rows each) when the rows are
  # sorted by email with three rows per recipient, and stays correct if the
  # number of rows per recipient differs.
  mutate(recipient.id = match(email, unique(email))) %>%
  # Drop variables containing personal information
  select(-email, -location, -reply_content) %>%
  arrange(recipient.id, ID_mail) %>%
  # Keep only the variables that go into the replication dataset, in their
  # final order
  select(recipient.id, ID_mail,
         sender_name, ethnicity, letter, literacy,
         response,
         female, prague, city100k, city50k,
         time_to_reply, distinct_reply, name_greet, reply_nwords, marked_spam,
         forwarded, automat, sent, excluded.share)

write_csv(rdata, 'Data/Replication_dataset.csv')

